Analysis of ARPAV Temperature Data

In this project, we will examine temperature data coming from ARPAV (Agenzia Regionale per la Protezione dell’Ambiente - Veneto) stations over the Veneto region. Most of the analysis pertains to the following locations:

Station Latitude Longitude Height above sea level
Auronzo di Cadore 46°33′33″ N 12°25′28″ E 887 m
Castelfranco Veneto 45°40′00″ N 11°55′00″ E 46 m
Cavallino-Treporti 45°27′31″ N 12°29′11″ E 1 m
Malo 45°40′10″ N 11°27′52″ E 98 m
Porto Tolle 44°56′58″ N 12°19′28″ E -22 m
Roverchiara 45°16′10″ N 11°14′41″ E 20 m

We have access to daily temperature records (minimum, average and maximum) in different temporal windows over the past three or four decades (see later for details).

The main goal of the project is to investigate the temperature trend over the years, as recorded by these stations, and test whether there is a general increase in temperature, consistent with climate change.

Data import and pre-processing

First, we inspect the datasets with a function that does minimal manipulations; the data is stored in csv files holding daily temperature records in each row.

get_daily_data <- function(station) {
    # Read the daily temperature CSV for one station and drop incomplete rows.
    #
    # station: station file name in snake_case (e.g. "malo"); the file is
    #   expected at ./data/daily/<station>.csv with one daily record per row.
    # Returns a tibble with columns date, min, avg, max (complete rows only).
    #
    # readr is referenced via :: instead of require(): require() returns
    # FALSE instead of erroring when the package is missing, silently
    # deferring the failure to the read_csv() call.
    fname <- paste0("./data/daily/", station, ".csv")

    readr::read_csv(
        fname,
        # The file ships its own header row: skip it and impose clean names.
        col_names = c("date", "min", "avg", "max"),
        skip = 1,
        show_col_types = FALSE
    ) |> na.omit()
}
# Preview the first daily records for Malo as a styled HTML table.
get_daily_data("malo") |>
    head() |>
    kableExtra::kbl() |>
    kableExtra::kable_styling(full_width = FALSE)
date min avg max
1992-04-01 5.0 7.3896 10.7
1992-04-03 5.0 7.9406 10.2
1992-04-04 8.2 10.1427 12.1
1992-04-05 7.9 9.9563 11.3
1992-04-06 8.3 9.3021 11.9
1992-04-07 7.7 10.8646 15.1
# Scatter the full daily series for Malo, one colour per record type.
get_daily_data("malo") |>
    ggplot(aes(x = date)) +
    geom_point(aes(y = min, colour = "min"), size = 0.1) +
    geom_point(aes(y = max, colour = "max"), size = 0.1) +
    geom_point(aes(y = avg, colour = "avg"), size = 0.1) +
    scale_colour_manual(
        values = my_pal |> set_names("min", "avg", "max"),
        labels = list(min = "Minima", avg = "Averages", max = "Maxima")
    ) +
    # Enlarge the legend keys: size-0.1 points are invisible in the legend.
    guides(colour = guide_legend(override.aes = list(size = 3))) +
    labs(
        x = "Date", y = "Temperature (°C)",
        # NULL (not element_blank()) is the documented way to suppress a
        # label in labs(); element_blank() is a theme element, not a label.
        colour = NULL,
        title = "Evolution of daily temperatures in Malo"
    )

To smooth things out, we can average over each month, still keeping separate the minima, averages and maxima.

get_monthly_data <- function(station) {
    # Monthly means of the daily minima/averages/maxima for one station.
    #
    # station: station id forwarded to get_daily_data().
    # Returns one row per (year, month); as before, the result keeps the
    # `year` grouping ("drop_last" is summarise()'s default for multiple
    # groups) — stating it explicitly silences dplyr's "`summarise()` has
    # grouped output" message without changing behaviour.
    require(dplyr)
    require(lubridate)

    get_daily_data(station) |>
        mutate(
            year = year(.data$date),
            # label = TRUE yields an ordered factor (Jan < Feb < ...),
            # which the month axes of the heatmaps below rely on.
            month = month(.data$date, label = TRUE)
        ) |>
        group_by(year, month) |>
        summarize(
            min = mean(.data$min),
            avg = mean(.data$avg),
            max = mean(.data$max),
            .groups = "drop_last"
        )
}
# Monthly view for Malo: the ribbon spans the monthly-mean minima to the
# monthly-mean maxima, the line tracks the monthly-mean averages.
get_monthly_data("malo") |>
    mutate(date = make_date(year = year, month = as.numeric(month))) |>
    ggplot(aes(x = date)) +
    geom_ribbon(
        aes(ymin = min, ymax = max),
        fill = my_pal[2], alpha = 0.5
    ) +
    geom_line(
        aes(y = avg), colour = my_pal[2], linewidth = 0.8
    ) +
    labs(
        x = "Date", y = "Temperature (°C)",
        title = "Evolution of monthly temperatures in Malo"
    )

Now, we present a different visualization of the monthly dataset, where we subtract from each record a baseline made by averaging over the first 8 years. Also, we will plot the average monthly thermal excursion (average of the maxima minus average of the minima), subject to the same baseline subtraction.

print_station <- function(station) {
    # Turn a snake_case station id (e.g. "porto_tolle") into a display
    # name ("Porto Tolle"): split on "_", rejoin with spaces, title-case.
    words <- str_split_1(station, "_")
    str_to_title(paste(words, collapse = " "))
}

temp_change_heatmaps <- function(station, avg_window) {
    # Build two year-by-month heatmaps for one station:
    #   $temperature — deviation of the monthly average temperature from a
    #                  per-month baseline over the first `avg_window` years;
    #   $excursion   — deviation of the monthly thermal excursion
    #                  (max - min) from the same baseline.
    # The baseline window is outlined with a grey rectangle on both plots.
    #
    # station:    station id forwarded to get_monthly_data().
    # avg_window: number of initial years used for the baseline.
    require(forcats)
    require(ggplot2)
    require(stringr)

    data <- get_monthly_data(station)

    # NOTE(review): assumes the years come out in ascending order (true for
    # grouped summaries upstream), so years[avg_window] is the baseline end.
    years <- unique(data$year)

    # Per-month climatological baseline over the first `avg_window` years.
    baseline <- data |>
        filter(.data$year <= years[avg_window]) |>
        group_by(.data$month) |>
        summarize(
            min = mean(.data$min),
            avg = mean(.data$avg),
            max = mean(.data$max)
        )

    station_name <- paste(print_station(station), "station")

    fill_pal <- wesanderson::wes_palette("Zissou1", 10, "continuous")

    # Joining on month suffixes the columns: .x = monthly data, .y = baseline.
    temperature_plt <- data |>
        inner_join(baseline, by = "month") |>
        mutate(avg = .data$avg.x - .data$avg.y) |>
        ggplot() +
            geom_tile(
                aes(
                    x = .data$year,
                    y = fct_rev(.data$month),
                    fill = .data$avg
                )
            ) +
            # Outline the baseline period; months occupy rows 1-12 on the
            # (reversed) discrete y axis, hence the 0.5/12.5 bounds.
            annotate(
                geom = "rect",
                xmin = years[1] - 0.5,
                xmax = years[avg_window] + 0.5,
                ymin = 12.5, ymax = 0.5,
                colour = "grey20", fill = NA,
                linewidth = 0.8
            ) +
            scale_x_continuous(
                breaks = scales::pretty_breaks(),
                expand = c(0, 0)
            ) +
            scale_fill_gradientn(colours = fill_pal) +
            labs(
                x = "Year", y = "Month", fill = "Deviation (°C)",
                title = paste0(
                    "Deviation of the average monthly ",
                    "temperature w.r.t. the ",
                    years[1], "-", years[avg_window], " period"
                ),
                subtitle = station_name
            )

    # Same construction for the thermal excursion:
    # (max - min) of the month minus (max - min) of the baseline.
    excursion_plt <- data |>
        inner_join(baseline, by = "month") |>
        mutate(
            diff = .data$max.x - .data$min.x - .data$max.y + .data$min.y
        ) |>
        ggplot() +
            geom_tile(
                aes(
                    x = .data$year, y = fct_rev(.data$month),
                    fill = .data$diff
                )
            ) +
            annotate(
                geom = "rect",
                xmin = years[1] - 0.5, xmax = years[avg_window] + 0.5,
                ymin = 12.5, ymax = 0.5,
                colour = "grey20", fill = NA,
                linewidth = 0.8
            ) +
            scale_fill_gradientn(colours = fill_pal) +
            labs(
                x = "Year", y = "Month", fill = "Deviation (°C)",
                title = paste0(
                    "Deviation of the average monthly thermal ",
                    "excursion w.r.t. the ",
                    years[1], "-", years[avg_window], " period"
                ),
                subtitle = station_name
        )

    list(temperature = temperature_plt, excursion = excursion_plt)
}
temp_change_heatmaps(station = "malo", avg_window = 8)
## $temperature

## 
## $excursion

Finally, to better study any significant climatic trend, we choose to take a further step in the averaging, and work with yearly temperatures.

get_yearly_data <- function(station) {
    # Yearly means of the daily minima/averages/maxima, restricted to years
    # with data in all 12 months: a partial year would bias the yearly mean
    # towards the seasons it happens to cover.
    data <- get_daily_data(station)

    # Years for which every one of the 12 months has at least one record.
    full_years <- data |>
        mutate(year = year(.data$date), month = month(.data$date)) |>
        group_by(year, month) |>
        count() |>
        group_by(year) |>
        filter(n() == 12) |>
        distinct(year) |>
        pull()

    data |>
        mutate(year = year(.data$date)) |>
        filter(year %in% full_years) |>
        group_by(year) |>
        summarize(
            min = mean(.data$min),
            avg = mean(.data$avg),
            max = mean(.data$max)
        )
}
# Yearly view for Malo: ribbon spans the yearly-mean minima/maxima, line
# tracks the yearly-mean averages.
get_yearly_data("malo") |>
    ggplot(aes(x = year)) +
    geom_ribbon(
        aes(ymin = min, ymax = max),
        fill = my_pal[2], alpha = 0.5
    ) +
    geom_line(aes(y = avg), colour = my_pal[2], linewidth = 0.8) +
    labs(
        x = "Year", y = "Temperature (°C)",
        title = "Evolution of yearly temperatures in Malo"
    )

We can see from the plot that the temperature has risen over the past thirty years: in the next section, we will investigate the trend and try to model it using Bayesian regression.

Modelling the yearly temperature trend

If we take a look at the yearly averages…

# Just the yearly average series, the quantity modelled in the next section.
get_yearly_data("malo") |>
    ggplot(aes(x = year)) +
    geom_line(aes(y = avg), colour = my_pal[2], linewidth = 0.8) +
    labs(
        x = "Year", y = "Temperature (°C)",
        title = "Evolution of the average yearly temperature in Malo"
    )

… we could argue that a linear fit would provide a good description of the data. Of course, the proper course of action should involve a statistical comparison with a null model: in this case, to see if the slope of the linear fit has any statistical significance, we will compare the results with those of a constant regression. Also, to look in the other direction we will analyse a more complex model, a quadratic fit, to see whether more parameters will provide a better fit.

We use JAGS to perform the three regressions, using vague (weakly informative) priors so as not to bias the model comparison.

# JAGS model strings for the three competing regressions. In JAGS, dnorm()
# is parameterized as (mean, precision), so dnorm(0, 1e-6) is a vague
# normal prior with sd = 1000, and dexp(0.0001) is a near-flat prior on
# the noise scale sigma.
lin_model <- "
    model {
        tau <- 1 / sigma^2

        for (i in 1:length(year)) {
            mu[i] <- a + b * year[i]
            avg[i] ~ dnorm(mu[i], tau)
        }

        a ~ dnorm(0, 1e-6)
        b ~ dnorm(0, 1e-6)
        sigma ~ dexp(0.0001)
    }"

# Parameters monitored for the linear model.
lin_pars <- c("a", "b", "sigma")

# Null model: a constant mean temperature.
const_model <- "
    model {
        tau <- 1 / sigma^2

        for (i in 1:length(year)) {
            avg[i] ~ dnorm(a, tau);
        }

        a ~ dnorm(0, 1e-6)
        sigma ~ dexp(0.0001);
    }"

const_pars <- c("a", "sigma")

# Over-parameterized alternative: quadratic trend in the year.
quad_model <- "
    model {
        tau <- 1 / sigma^2

        for (i in 1:length(year)) {
            mu[i] <- a + b * year[i] + c * year[i] * year[i]
            avg[i] ~ dnorm(mu[i], tau)
        }

        a ~ dnorm(0, 1e-6)
        b ~ dnorm(0, 1e-6)
        c ~ dnorm(0, 1e-6)
        sigma ~ dexp(0.0001)
    }"

quad_pars <- c("a", "b", "c", "sigma")

init_variables <- function(model) {
    # Return a generator of random initial values matched to the given JAGS
    # model string (dispatch by comparing against the known model strings).
    # An unrecognized model falls through to NULL, in which case R2jags
    # generates its own initial values.
    if (identical(model, const_model)) {
        return(function() {
            list(
                a = runif(1, -200, 0),
                sigma = runif(1, 0, 1)
            )
        })
    }
    if (identical(model, lin_model)) {
        return(function() {
            list(
                a = runif(1, -200, 0),
                b = runif(1, 0, 1),
                sigma = runif(1, 0, 1)
            )
        })
    }
    if (identical(model, quad_model)) {
        return(function() {
            list(
                a = runif(1, -200, 0),
                b = runif(1, 0, 1),
                c = runif(1, 0, 1),
                sigma = runif(1, 0, 1)
            )
        })
    }
    NULL
}

regression <- function(station, model, params) {
    # Fit a JAGS model string to the yearly data of one station.
    #
    # station: station id forwarded to get_yearly_data().
    # model:   JAGS model string (one of the *_model globals).
    # params:  parameter names to monitor.
    # Returns the R2jags fit object.
    yearly_data <- get_yearly_data(station)

    R2jags::jags(
        data = yearly_data,
        inits = init_variables(model),
        parameters.to.save = params,
        model.file = textConnection(model),
        n.chains = 3,
        n.iter = 8e4,
        n.burnin = 2000,
        n.thin = 10,
        DIC = TRUE,
        quiet = TRUE
    )
}

plot_chains <- function(chain, params) {
    # Side-by-side trace and density-overlay plots for the monitored
    # parameters of an mcmc object, one facet row per parameter.
    #
    # bayesplot:: is now used consistently (mcmc_trace/facet_text were
    # bare while theme_default was qualified), and the shared theme is
    # built once instead of twice.
    n_params <- length(params)
    chain_theme <- bayesplot::theme_default(
        base_size = 15, base_family = global_font
    )

    trace_plt <- bayesplot::mcmc_trace(
        chain, pars = params,
        facet_args = list(nrow = n_params, labeller = label_parsed)
    ) +
        bayesplot::facet_text(size = 15) +
        labs(title = "Traces") +
        chain_theme

    dens_plt <- bayesplot::mcmc_dens_overlay(
        chain, pars = params,
        facet_args = list(nrow = n_params, labeller = label_parsed)
    ) +
        bayesplot::facet_text(size = 15) +
        labs(title = "Densities") +
        chain_theme

    gridExtra::grid.arrange(trace_plt, dens_plt, ncol = 2)
}

plot_posteriors <- function(chain, pars) {
    # Posterior area plot (95% interval, mean point estimate) for the
    # requested parameters of an mcmc object.
    areas_plt <- bayesplot::mcmc_areas(
        chain, pars = pars, prob = 0.95, point_est = "mean"
    )
    base_theme <- bayesplot::theme_default(
        base_size = 14, base_family = global_font
    )
    areas_plt + base_theme
}

plot_regression <- function(station, chain, pars) {
    # Overlay the fitted regression curve (posterior means) on the yearly
    # average temperatures, with a 95% band combining parameter uncertainty
    # and the Gaussian residual noise (posterior predictive band).
    #
    # station: station id forwarded to get_yearly_data().
    # chain:   R2jags fit; draws are read from chain$BUGSoutput.
    # pars:    monitored parameter names; length 2/3/4 selects the
    #          constant/linear/quadratic model respectively.
    require(dplyr)
    require(purrr)

    data <- get_yearly_data(station)

    n_plt <- 100
    year_plt <- seq(min(data$year), max(data$year), length = n_plt)

    a_smp <- as.vector(chain$BUGSoutput$sims.list$a)
    a_plt <- as.vector(chain$BUGSoutput$mean$a)

    # One noise draw per posterior sample of sigma, so the band reflects
    # the residual noise and not only the uncertainty on the mean curve.
    sigma_smp <- map_dbl(
        as.vector(chain$BUGSoutput$sims.list$sigma),
        ~ rnorm(1, mean = 0, sd = .x)
    )

    if (length(pars) == 2) {
        # Constant model: the band does not depend on the year.
        # (`probs` spelled out — quantile()'s argument was previously
        # partially matched as `prob`.)
        lower_plt <- rep(quantile(a_smp + sigma_smp, probs = 0.025), n_plt)
        upper_plt <- rep(quantile(a_smp + sigma_smp, probs = 0.975), n_plt)

        data <- data |> mutate(plt = a_plt)

    } else if (length(pars) == 3) {
        b_smp <- as.vector(chain$BUGSoutput$sims.list$b)
        b_plt <- as.vector(chain$BUGSoutput$mean$b)

        lower_plt <- map_dbl(
            year_plt,
            ~ quantile(a_smp + .x * b_smp + sigma_smp, probs = 0.025)
        )
        upper_plt <- map_dbl(
            year_plt,
            ~ quantile(a_smp + .x * b_smp + sigma_smp, probs = 0.975)
        )

        data <- data |> mutate(plt = a_plt + b_plt * .data$year)

    } else if (length(pars) == 4) {
        b_smp <- as.vector(chain$BUGSoutput$sims.list$b)
        b_plt <- as.vector(chain$BUGSoutput$mean$b)

        c_smp <- as.vector(chain$BUGSoutput$sims.list$c)
        c_plt <- as.vector(chain$BUGSoutput$mean$c)

        lower_plt <- map_dbl(
            year_plt,
            ~ quantile(
                a_smp + .x * b_smp + .x^2 * c_smp + sigma_smp,
                probs = 0.025
            )
        )
        upper_plt <- map_dbl(
            year_plt,
            ~ quantile(
                a_smp + .x * b_smp + .x^2 * c_smp + sigma_smp,
                probs = 0.975
            )
        )

        data <- data |>
            mutate(plt = a_plt + b_plt * .data$year + c_plt * .data$year^2)
    }

    # Keep the ribbon's x inside its own data frame instead of referencing
    # the external vector year_plt from aes(), which silently relies on
    # the lengths lining up.
    band <- tibble(x = year_plt, low = lower_plt, upp = upper_plt)

    ggplot(data, aes(x = .data$year)) +
        geom_line(aes(y = .data$avg), colour = my_pal[1], linewidth = 0.8) +
        geom_line(aes(y = .data$plt), colour = my_pal[2], linewidth = 0.8) +
        geom_ribbon(
            aes(x = .data$x, ymin = .data$low, ymax = .data$upp),
            data = band,
            fill = my_pal[2], alpha = 0.4
        ) +
        labs(
            x = "Year", y = "Temperature (°C)",
            title = paste(
                ifelse(
                    length(pars) == 2, "Constant",
                    ifelse(length(pars) == 3, "Linear", "Quadratic")
                ),
                "regression of the average yearly temperature in",
                print_station(station)
            )
        )
}

Let’s start with the linear model.

lin_chain <- regression("malo", lin_model, lin_pars)
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "min" in data
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "max" in data
summary(lin_chain)
##                    Length Class          Mode     
## model               8     jags           list     
## BUGSoutput         24     bugs           list     
## parameters.to.save  4     -none-         character
## model.file          1     textConnection numeric  
## n.iter              1     -none-         numeric  
## DIC                 1     -none-         logical
# Convert the R2jags fit to an mcmc object for bayesplot/tidybayes use.
lin_mcmc <- as.mcmc(lin_chain)

plot_chains(lin_mcmc, lin_pars)

# One posterior area plot per parameter, side by side.
gridExtra::grid.arrange(
    plot_posteriors(lin_mcmc, "a"),
    plot_posteriors(lin_mcmc, "b"),
    plot_posteriors(lin_mcmc, "sigma"),
    ncol = 3
)

plot_regression("malo", lin_chain, lin_pars)

The credibility interval (considering both the stochasticity from the line parameters and of the Gaussian noise around the line) incorporates well the fitted data: the trend is clearly linear.

Let’s proceed with the other two models, the constant regression first.

const_chain <- regression("malo", const_model, const_pars)
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "min" in data
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "max" in data
summary(const_chain)
##                    Length Class          Mode     
## model               8     jags           list     
## BUGSoutput         24     bugs           list     
## parameters.to.save  3     -none-         character
## model.file          1     textConnection numeric  
## n.iter              1     -none-         numeric  
## DIC                 1     -none-         logical
const_mcmc <- as.mcmc(const_chain)

plot_chains(const_mcmc, const_pars)

gridExtra::grid.arrange(
    plot_posteriors(const_mcmc, "a"),
    plot_posteriors(const_mcmc, "sigma"),
    ncol = 2
)

By inspecting the posteriors and the chains, the constant model does not seem disastrous: so we need some robust tests to verify whether the linear regression works better (see later).

The a coefficient is in good agreement with the overall mean of the fitted temperatures, as it should be.

a from data
mean sd
13.7 0.64
## Best a from MCMC: 13.7 ± 0.126
plot_regression("malo", const_chain, const_pars)

quad_chain <- regression("malo", quad_model, quad_pars)
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "min" in data
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "max" in data
quad_mcmc <- as.mcmc(quad_chain)

plot_chains(quad_mcmc, quad_pars)

gridExtra::grid.arrange(
    plot_posteriors(quad_mcmc, "a"),
    plot_posteriors(quad_mcmc, "b"),
    plot_posteriors(quad_mcmc, "c"),
    plot_posteriors(quad_mcmc, "sigma"),
    ncol = 2
)

The quadratic fit has a different problem: the posteriors of a, b and c are extremely wide. Let’s check the mean values against the data.

plot_regression("malo", quad_chain, quad_pars)

We indeed get a good fit – in fact, it’s indistinguishable from the linear one – so the posteriors are at least well centred on the right values. The curve is practically a straight line, so we already have a strong hint that the quadratic model brings only unnecessary additional complexity without improving the descriptive power.

Since this quadratic fit was not satisfactory, we can try using more focused priors for the parabola parameters: we choose to use the estimates from a simple lm fit.

# Frequentist fit used only to centre informative priors for the second
# quadratic model: point estimates and standard errors of the coefficients.
lm_fit <- lm(avg ~ year + I(year^2), get_yearly_data("malo"))
lm_params <- summary(lm_fit)$coefficients[, 1]
lm_errs <- summary(lm_fit)$coefficients[, 2]

# Same quadratic JAGS model, but with normal priors centred on the lm
# estimates; the second dnorm argument is a precision (1 / variance),
# as JAGS requires, hence 1 / err^2.
quad_model_2 <- sprintf("
    model {
        tau <- 1 / sigma^2

        for (i in 1:length(year)) {
            mu[i] <- a + b * year[i] + c * year[i] * year[i]
            avg[i] ~ dnorm(mu[i], tau)
        }

        a ~ dnorm(%e, %e)
        b ~ dnorm(%e, %e)
        c ~ dnorm(%e, %e)
        sigma ~ dexp(0.0001)
    }",
    lm_params[1], 1 / lm_errs[1]^2,
    lm_params[2], 1 / lm_errs[2]^2,
    lm_params[3], 1 / lm_errs[3]^2
)

quad_chain_2 <- regression("malo", quad_model_2, quad_pars)
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "min" in data
## Warning in jags.model(model.file, data = data, inits = init.values, n.chains =
## n.chains, : Unused variable "max" in data
quad_mcmc_2 <- as.mcmc(quad_chain_2)
gridExtra::grid.arrange(
    plot_posteriors(quad_mcmc_2, "a"),
    plot_posteriors(quad_mcmc_2, "b"),
    plot_posteriors(quad_mcmc_2, "c"),
    plot_posteriors(quad_mcmc_2, "sigma"),
    ncol = 2
)

Even in this case, the quadratic parameter c is small and the posteriors are very wide. This is a sign that the likelihood is extremely flat: almost any choice of a and b leads to a good (over)fit.

Since we are drawing samples with a MCMC, we cannot calculate something like the Bayes Factor to compare the models, because we do not have direct access to the likelihood.

A suitable (and commonplace in MCMC analysis) alternative is given by the Deviance Information Criterion (DIC). This quantity is defined starting from the deviance of the posterior,

\[ \overline{D(\theta)} = \mathrm{E}_\theta [D(\theta)] = \mathrm{E}_\theta [-2 \log p(y \,|\, \theta)] \]

where \(y\) stands for the data and \(\theta\) for the model parameters. This is a measure of the goodness-of-fit of the model. JAGS then defines the DIC as

\[ \mathrm{DIC}_{\mathrm{JAGS}} = \overline{D(\theta)} + \frac{1}{2} \mathrm{Var}[D(\theta)]. \]

The second term roughly quantifies the effective number of parameters: the larger it is, the easier it is for the model to fit the data, and so the deviance needs to be penalized.

# Extract the Deviance Information Criterion from an R2jags fit object.
get_dic <- function(chain) chain[["BUGSoutput"]][["DIC"]]

# DIC comparison table (lower is better): near-equal linear and quadratic
# values indicate the extra quadratic parameter buys nothing.
tribble(
    ~ model, ~ DIC,
    "Constant", format(get_dic(const_chain), digits = 3),
    "Linear", format(get_dic(lin_chain), digits = 3),
    "Quadratic", format(get_dic(quad_chain), digits = 3)
) |>
    kableExtra::kbl() |>
    kableExtra::kable_styling(full_width = FALSE)
model DIC
Constant 59.9
Linear 37.9
Quadratic 37.9

The DIC highlights that we probably don’t need a more complex model than the linear one, since the value is exactly the same.

Hypothesis test on the linear model

Now we will check, with a hypothesis test on the posterior drawn from the chain, if the slope of the linear model is significantly greater than zero. So, we set up a null hypothesis \(H_0: b \leq 0\) and test it at a 5% level of significance.

lin_df <- lin_mcmc |> tidybayes::tidy_draws()
sig_level <- quantile(lin_df$b, probs = 0.05)

# Density over the rejection region, computed once (it was evaluated twice
# before — once for the x values and once for the y values).
rej_dens <- density(lin_df$b, from = -0.01, to = sig_level)

# Posterior of the slope b: the shaded left tail is the 5% rejection
# region, the vertical line marks b = 0.
ggplot(lin_df) +
    geom_density(aes(b), colour = my_pal[1], fill = my_pal[1], alpha = 0.5) +
    geom_area(
        aes(x, y),
        data = tibble(x = rej_dens$x, y = rej_dens$y),
        colour = my_pal[3], fill = my_pal[3], alpha = 0.5
    ) +
    geom_vline(xintercept = 0, colour = my_pal[2], linewidth = 0.8) +
    coord_cartesian(expand = FALSE) +
    labs(
        x = "Slope of the linear regression (°C/year)",
        y = "Sampled posterior density"
    )

We can comfortably reject the null hypothesis, and state at a 95% level of credibility that the temperatures display a rising trend through the years.

Correlation between the stations

To study the correlations between different locations, we compute the Pearson coefficient between the temperature readings through the years between two given stations.

correlation_plot <- function(stat1, stat2) {
    # Compare the average yearly temperatures of two stations.
    # Returns a list with $cor (Pearson coefficient between the two
    # average series, on the years both stations cover) and $plot
    # (scatterplot of one station's averages against the other's).
    df <- inner_join(
        get_yearly_data(stat1),
        get_yearly_data(stat2),
        by = "year",
        suffix = c(".s1", ".s2")
    )

    name1 <- print_station(stat1)
    name2 <- print_station(stat2)

    plot <- ggplot(df, aes(x = .data$avg.s1, y = .data$avg.s2)) +
        geom_point(aes(colour = .data$year), size = 3) +
        # Label every third point to keep the plot readable.
        ggrepel::geom_label_repel(
            aes(label = .data$year, colour = .data$year),
            data = \(x) x |> filter(row_number() %% 3 == 1),
            min.segment.length = 0,
            box.padding = 0.5
        ) +
        labs(
            x = paste(name1, "temperature (°C)"),
            y = paste(name2, "temperature (°C)"),
            colour = "Year",
            title = paste(
                "Comparison of the average yearly temperatures in",
                name1, "and", name2
            )
        )

    list(
        cor = cor(df$avg.s1, df$avg.s2, method = "pearson"),
        plot = plot
    )
}

To give an example, let’s check Porto Tolle and Roverchiara, two low-altitude locations.

por_rov_corr <- correlation_plot("porto_tolle", "roverchiara")

por_rov_corr$cor
## [1] 0.9506927
por_rov_corr$plot

As expected older years (for both locations) are clustered in the colder zone, while more recent years lie in the hotter corner.

We can already appreciate how the points lie close to the diagonal, i.e. the (averaged) temperature readings are consistently similar in similar environments. A more detailed analysis on this phenomenon will be presented in the last chapter.

Now let’s try swapping Porto Tolle for a mountainous location, Auronzo di Cadore. We expect the scatterplot to widen at least a little bit.

aur_rov_corr <- correlation_plot("auronzo", "roverchiara")

aur_rov_corr$cor
## [1] 0.8533582
aur_rov_corr$plot

There are indeed more points far from the diagonal. We will see better the difference in the last chapter.

Comparison with SNPA findings

From now on, we will assume that the best description of the rising temperature trend is the linear model. This time though we will average the temperatures over multi-year intervals, to smooth the plot.

Then we will perform a linear Bayesian regression with JAGS, as before, to estimate the temperature change rate of the minima, averages and maxima, and we will compare them with the ones found in this report from SNPA:

La stima aggiornata del rateo di variazione della temperatura media dal 1981 al 2019 è di +0,38 ± 0,05 °C/10 anni; il rateo di variazione della temperatura massima (+0,42 ± 0,06 °C/10 anni) è maggiore di quello della temperatura minima (+0,34 ± 0,04 °C/10 anni).

# SNPA reference temperature-change rates from the quoted report,
# expressed in °C per decade (divided by 10 where they are used).
snpa <- tibble(
    type = c("min", "avg", "max"),
    mu = c(0.34, 0.38, 0.42),
    sigma = c(0.04, 0.05, 0.06)
)
get_period_data <- function(station, interval) {
    # Average the yearly data over consecutive `interval`-year periods,
    # starting after 1980 to match the SNPA reference window.
    data <- get_yearly_data(station) |>
        filter(.data$year > 1980)
    # Trim trailing years so the row count is a multiple of `interval`.
    data <- data[seq(1, nrow(data) - (nrow(data) %% interval)), ]
    first_year <- data$year[1]
    last_year <- data$year[nrow(data)]

    data |>
        mutate(
            # Left-closed bins: [first, first + interval), ...
            period = cut(
                .data$year,
                breaks = seq(first_year, last_year + 1, interval),
                right = FALSE
            )
        ) |>
        group_by(period) |>
        summarize(
            # Midpoint of each period, parsed from the cut() label
            # "[yyyy,yyyy)" by fixed character positions (2-5 = left
            # edge, 7-10 = right edge) — NOTE(review): this assumes
            # 4-digit years and the default label format.
            centre = (
                as.integer(substr(as.character(unique(period)), 7, 10)) +
                as.integer(substr(as.character(unique(period)), 2, 5)) - 1
            ) / 2,
            min = mean(.data$min),
            avg = mean(.data$avg),
            max = mean(.data$max)
        )
}
period_regression <- function(data, type) {
    # Bayesian linear regression of one record type ("min", "avg" or
    # "max") against the period centres, with normal priors centred on a
    # preliminary lm() fit. Returns the R2jags fit object.
    data <- data[, c("centre", type)]

    # Frequentist fit used to build informative priors below.
    lm_fit <- lm(as.formula(paste(type, "~ centre")), data)
    lm_pars <- summary(lm_fit)$coefficients[, 1]
    lm_errs <- summary(lm_fit)$coefficients[, 2]

    # %s injects the response column name; dnorm's second argument is a
    # precision (1 / variance), hence 1 / err^2.
    model <- sprintf("
        model {
            tau <- 1 / sigma^2

            for (i in 1:length(centre)) {
                mu[i] <- a + b * centre[i]
                %s[i] ~ dnorm(mu[i], tau)
            }

            a ~ dnorm(%f, %f)
            b ~ dnorm(%f, %f)
            sigma ~ dexp(0.0001)
        }",
        type,
        lm_pars[1], 1 / lm_errs[1]^2,
        lm_pars[2], 1 / lm_errs[2]^2
    )

    pars <- c("a", "b", "sigma")

    R2jags::jags(
        data = data,
        inits = \() list(a = rnorm(1), b = rnorm(1), sigma = runif(1)),
        parameters.to.save = pars,
        model.file = textConnection(model),
        n.chains = 3,
        n.iter = 30000,
        n.burnin = 2000,
        n.thin = 10,
        DIC = FALSE
    )
}

plot_period_regression <- function(station, type, chain, interval) {
    # Overlay the linear fit (posterior means) on the period-averaged
    # temperatures of one record type ("min", "avg" or "max"), with a 95%
    # posterior predictive band. Mirrors plot_regression() for the
    # period-averaged data.
    require(dplyr)
    require(purrr)

    data <- get_period_data(station, interval)

    n_plt <- 100
    year_plt <- seq(min(data$centre), max(data$centre), length = n_plt)

    a_smp <- as.vector(chain$BUGSoutput$sims.list$a)
    a_plt <- as.vector(chain$BUGSoutput$mean$a)

    b_smp <- as.vector(chain$BUGSoutput$sims.list$b)
    b_plt <- as.vector(chain$BUGSoutput$mean$b)

    # One Gaussian-noise draw per posterior sample of sigma, so the band
    # includes the residual noise as well as parameter uncertainty.
    sigma_smp <- map_dbl(
        as.vector(chain$BUGSoutput$sims.list$sigma),
        ~ rnorm(1, mean = 0, sd = .x)
    )

    # `probs` spelled out (quantile()'s argument was previously partially
    # matched as `prob`).
    lower_plt <- map_dbl(
        year_plt,
        ~ quantile(a_smp + .x * b_smp + sigma_smp, probs = 0.025)
    )
    upper_plt <- map_dbl(
        year_plt,
        ~ quantile(a_smp + .x * b_smp + sigma_smp, probs = 0.975)
    )

    data <- data |> mutate(plt = a_plt + b_plt * .data$centre)

    # Keep the ribbon's x inside its own data frame rather than
    # referencing the external vector year_plt from aes().
    band <- tibble(x = year_plt, low = lower_plt, upp = upper_plt)

    ggplot(data, aes(x = .data$centre)) +
        geom_line(
            aes(y = .data[[type]]), colour = my_pal[1], linewidth = 0.8
        ) +
        geom_line(
            aes(y = .data$plt), colour = my_pal[2], linewidth = 0.8
        ) +
        geom_ribbon(
            aes(x = .data$x, ymin = .data$low, ymax = .data$upp),
            data = band,
            fill = my_pal[2], alpha = 0.4
        ) +
        scale_x_continuous(breaks = scales::pretty_breaks()) +
        labs(
            x = "Year", y = "Temperature (°C)",
            title = paste(
                "Linear regression of the yearly",
                ifelse(
                    type == "min", "minimum",
                    ifelse(type == "avg", "average", "maximum")
                ),
                "temperature in",
                print_station(station)
            )
        )
}

Let’s view for example what happens with the Auronzo data, average temperatures.

period_length <- 5
period_data <- get_period_data("auronzo", period_length)
period_chain <- period_regression(period_data, "avg")
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 7
##    Unobserved stochastic nodes: 3
##    Total graph size: 41
## 
## Initializing model
period_mcmc <- as.mcmc(period_chain)

plot_chains(period_mcmc, lin_pars)

gridExtra::grid.arrange(
    plot_posteriors(period_mcmc, "a"),
    plot_posteriors(period_mcmc, "b"),
    plot_posteriors(period_mcmc, "sigma"),
    ncol = 2
)

plot_period_regression("auronzo", "avg", period_chain, period_length)

snpa_comparison <- function(station, type, interval) {
    # Compatibility test between this station's fitted yearly rate and the
    # SNPA reference rate for the same record type. The slope posterior is
    # approximated by a normal with the chain's mean and sd; the SNPA rate
    # (given per decade, hence / 10) is treated as an independent normal,
    # so the difference is normal with the summed variances.
    period_data <- get_period_data(station, interval)
    period_chain <- period_regression(period_data, type)
    chain_pars <- list(
        mu = as.vector(period_chain$BUGSoutput$mean$b),
        sigma = as.vector(period_chain$BUGSoutput$sd$b)
    )

    mu_d <- chain_pars$mu - snpa$mu[snpa$type == type] / 10
    sigma_d <- sqrt(
        chain_pars$sigma^2 + (snpa$sigma[snpa$type == type] / 10)^2
    )

    # 95% credibility interval of the difference.
    cred <- qnorm(c(0.025, 0.975), mu_d, sigma_d)

    x <- mu_d + seq(-5 * sigma_d, 5 * sigma_d, length.out = 1000)

    # In the filter() lambdas below, the lambda argument is the tibble and
    # the bare `x` inside filter() resolves to its x column (data masking).
    tibble(x = x, y = dnorm(x, mu_d, sigma_d)) |>
        ggplot(aes(.data$x, .data$y)) +
            geom_area(fill = my_pal[1], alpha = 0.5) +
            geom_area(
                data = \(x) x |> filter(x < cred[1]),
                fill = my_pal[3], alpha = 0.5
            ) +
            geom_area(
                data = \(x) x |> filter(x > cred[2]),
                fill = my_pal[3], alpha = 0.5
            ) +
            geom_vline(
                xintercept = 0, colour = my_pal[2], linewidth = 0.8
            ) +
            coord_cartesian(expand = FALSE) +
            labs(
                x = "Difference of the posterior means (°C/year)",
                y = "Posterior distribution",
                title = paste(
                    "Compatibility test of the",
                    ifelse(
                        type == "min", "minimum",
                        ifelse(type == "max", "maximum", "average")
                    ),
                    "temperatures’ yearly increase rates"
                ),
                subtitle = paste(
                    "Difference between", print_station(station),
                    "and SNPA data"
                )
            )
}

To compare the two measurements, we approximate the posterior of the slope parameters with a normal distribution, and we assume that the SNPA value comes from a normal distribution as well. We can then perform a hypothesis test with the null hypothesis that the difference between the two measurements is equal to 0: \(H_0: \mu_{\mathrm{post}} - \mu_{\mathrm{SNPA}} = 0\).

snpa_comparison("auronzo", "min", period_length)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 7
##    Unobserved stochastic nodes: 3
##    Total graph size: 41
## 
## Initializing model

snpa_comparison("auronzo", "avg", period_length)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 7
##    Unobserved stochastic nodes: 3
##    Total graph size: 41
## 
## Initializing model

snpa_comparison("auronzo", "max", period_length)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 7
##    Unobserved stochastic nodes: 3
##    Total graph size: 41
## 
## Initializing model

With the maxima and averages, \(0\) lies in the acceptance zone, so we can say that the two measurements are compatible at a 95% credibility level. In the case of the minima, \(0\) lies in the rejection zone, so the test suggests a difference, significant at the 5% level, between our findings and the SNPA ones.

Predictive analysis of the time series

ARIMA (Autoregressive Integrated Moving Average) models provide an approach to time series forecasting with the aim to describe the autocorrelations in the data.

These models are based on:

  • Differencing: change between consecutive observations in the original series

    \[ y'_t = y_t - y_{t-1} \]

  • Autoregression: forecast of the variable of interest using a linear combination of past values of the variable

    \[ y_t = c + \phi_1 y_{t-1} + \phi_2 y_{t-2} + \cdots + \phi_p y_{t-p} + \epsilon_t \]

where \(\epsilon_t\) is normally distributed white noise with mean zero and constant variance \(\sigma^2\).

  • Moving average: past forecast errors in a regression-like model

    \[ y_t = c + \epsilon_t + \theta_1 \epsilon_{t-1} + \theta_2 \epsilon_{t-2} + \cdots + \theta_q \epsilon_{t-q} \]

Combining these three elements we get a non-seasonal ARIMA model:

\[ y'_t = c + \phi_1 y'_{t-1} + \phi_2 y'_{t-2} + \cdots + \phi_p y'_{t-p} + \epsilon_t + \theta_1 \epsilon_{t-1} + \theta_2 \epsilon_{t-2} + \cdots + \theta_q \epsilon_{t-q} \]

where we denote with \(p\) the order of the autoregressive part, \(d\) the differencing degree and \(q\) the order of the moving average part.

We now follow these steps to fit our data with ARIMA: if the data are non-stationary, we take first differences of the data until the data are stationary; then, we choose some reasonable models and see for which of them the AIC is lowest, and we examine the ACF of the residuals to see if they are compatible with white noise. Finally, we evaluate the forecast.

library(forecast)

# Year-over-year changes of a station's yearly temperatures.
#
# Returns a data frame with one row per year (except the first, which
# has no predecessor to difference against) holding the first
# differences of the yearly minimum, average and maximum temperatures.
get_diff_data <- function(station) {
    yearly <- get_yearly_data(station)

    yearly |>
        reframe(
            year = .data$year[-1],
            diff_avg = diff(.data$avg),
            diff_min = diff(.data$min),
            diff_max = diff(.data$max)
        )
}

# Fit an ARIMA model to one of the station's (average) temperature
# series and forecast it forward.
#
# Arguments:
#   data_type  which aggregation of the average temperatures to model:
#              "diff" (yearly first differences), "yearly", "period"
#              (interv-year means) or "monthly".
#   station    station identifier, forwarded to the get_*_data helpers.
#   pred_years number of years to forecast ahead.
#   interv     period length in years; only used when data_type is
#              "period".
#
# Returns a list with:
#   plot  a 2x2 grid: the series, its ACF, the residuals' ACF, and the
#         series with the forecast appended.
#   fc    the `forecast` object itself.
forecast_prediction <- function(
    data_type, station, pred_years, interv = 5
) {
    # Fail early on an unsupported data_type instead of hitting an
    # obscure "object 'df' not found" error further down.
    data_type <- match.arg(
        data_type, c("diff", "yearly", "period", "monthly")
    )

    if (data_type == "diff") {
        df <- get_diff_data(station)
        time_series <- ts(
            data = df$diff_avg,
            frequency = 1,
            start = df$year[1]
        )

        aa <- Arima(time_series, order = c(5, 0, 5))
        fc <- forecast(aa, level = 95, h = pred_years)

    } else if (data_type == "yearly") {
        df <- get_yearly_data(station)
        time_series <- ts(
            data = df$avg,
            frequency = 1,
            start = df$year[1]
        )

        aa <- Arima(time_series, order = c(3, 1, 4))
        fc <- forecast(aa, level = 95, h = pred_years)

    } else if (data_type == "period") {
        df <- get_period_data(station, interv)
        time_series <- ts(
            data = df$avg,
            frequency = 1 / interv,
            start = df$centre[1]
        )

        aa <- Arima(time_series, order = c(1, 1, 1))
        fc <- forecast(aa, level = 95, h = as.integer(pred_years / interv))

    } else if (data_type == "monthly") {
        df <- get_monthly_data(station)
        time_series <- ts(
            data = df$avg,
            frequency = 12,
            # ts() expects a length-2 (year, month) start; the original
            # passed the whole year column instead of its first element.
            start = c(df$year[1], as.numeric(df$month[1]))
        )

        aa <- Arima(
            time_series,
            order = c(3, 1, 3),
            seasonal = c(1, 1, 2),
            lambda = 0
        )
        fc <- forecast(aa, level = 95, h = 12 * pred_years)
    }

    # Every branch models the *average* temperatures (or their first
    # differences).  The original label tested data_type against
    # "min"/"avg" — values it never takes — and always read "Maximum".
    y_lab <- paste(
        if (data_type == "diff") "Average temperature differences"
        else "Average temperatures",
        "(°C)"
    )

    fc_plt <- gridExtra::grid.arrange(
        autoplot(time_series) +
            theme_minimal(base_size = 8, base_family = global_font) +
            labs(x = "Time", y = y_lab, title = "Time series"),
        ggAcf(time_series) +
            theme_minimal(base_size = 8, base_family = global_font) +
            labs(x = "Lag", y = "ACF", title = "ACF of the time series"),
        ggAcf(aa$residuals) +
            theme_minimal(base_size = 8, base_family = global_font) +
            labs(x = "Lag", y = "ACF", title = "ACF of the residuals"),
        autoplot(fc) +
            theme_minimal(base_size = 8, base_family = global_font) +
            labs(
                x = "Time", y = y_lab,
                title = "Time series plus forecast"
            ),
        ncol = 2
    )

    list(plot = fc_plt, fc = fc)
}
forecast_prediction("diff", "malo", 10)$plot

## TableGrob (2 x 2) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
forecast_prediction("yearly", "malo", 10)$plot

## TableGrob (2 x 2) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
forecast_prediction("period", "malo", 10)$plot

## TableGrob (2 x 2) "arrange": 4 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (1-1,2-2) arrange gtable[layout]
## 3 3 (2-2,1-1) arrange gtable[layout]
## 4 4 (2-2,2-2) arrange gtable[layout]
# Turn the forecast of the *differenced* yearly averages back into an
# absolute-temperature forecast: the predicted yearly increments are
# cumulated starting from the last observed average temperature, and
# the result is plotted after the historical series.
diff_unrolling <- function(stat, py) {
    diff_fc <- forecast_prediction(
        data_type = "diff", station = stat, pred_years = py
    )$fc

    observed <- get_yearly_data(stat)
    n_obs <- nrow(observed)

    last_year <- observed$year[n_obs]
    last_temp <- observed$avg[n_obs]

    # Unroll the differences: each forecast step adds its predicted
    # increment to the running temperature.
    unrolled <- data.frame(
        x = last_year + seq_len(py),
        y = last_temp + cumsum(diff_fc$mean)
    )

    ggplot(observed) +
        geom_line(
            aes(x = .data$year, y = .data$avg),
            colour = my_pal[1], linewidth = 0.8
        ) +
        geom_line(
            aes(x = .data$x, y = .data$y), data = unrolled,
            colour = my_pal[3], linewidth = 0.8
        ) +
        labs(
            x = "Year", y = "Temperature (°C)",
            title = paste(
                "Average temperature prediction for",
                print_station(stat)
            )
        )
}

diff_unrolling("auronzo", 15)